RNA-seq analysis workflow

Code Backup

Back to Main Page

Preparation

Load packages

library(dplyr)
library(ggplot2)
library(DESeq2)
library(edgeR)
library(reshape)
library(dplyr)

Sample data

Data import (count matrix).
Data import (TPM).

TPM calculation manually

To get GTF file, visit the following link :
https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/genes/ Human
https://hgdownload.soe.ucsc.edu/goldenPath/mm10/bigZips/genes/ Mouse

library(rtracklayer)
library(GenomicFeatures)
library(dplyr)

dir = "path/ref/"

gtf_path <- paste0(dir,"hg38_gtf_ucsc/hg38.refGene.gtf")
gtf <- rtracklayer::import(gtf_path)

txdb <- makeTxDbFromGFF(gtf_path, format="gtf")

# Gene information (start and end) 
genes <- genes(txdb)

# Create data frame 
gene_lengths <- with(as.data.frame(genes), {
  gene_id = mcols(genes)$gene_id
  start = start
  end = end
  width = end - start + 1
  data.frame(gene_id, start, end, width)
})


# head(gene_lengths)
# gene_lengths %>% dim()

# Check the data manually  
gs = c("TACSTD2","EGFR") 
gene_lengths[gene_lengths$gene_id %in% gs,]

# Save output 
gene_lengths %>% write.csv(paste0(dir,"hg38_refGene_geneLength.csv"))

Calculate TPM sample by sample

for(i in 1:ncol(count_mtx)){   
  rpkm = count_mtx[,i]/(gene_length/1000)   
  rpkm_sum = sum(rpkm)   
  tpm <- rpkm / rpkm_sum * 1e6   
  tpm_all[,i] = tpm   
}  

Principal component analysis

ggplot(pcaData, aes(x = PC1, y = PC2, fill = group)) +
  geom_point(size = 4, alpha = 0.6, shape = 21, color = "black", stroke = 0.5)  +
  ggrepel::geom_text_repel(aes(label=name), 
                           color="grey6", size=3, hjust= -0.3, vjust=-0.3) +
  labs(x = paste("PC1: ", round(100 * PCA_var[1]), "% variance"),
       y = paste("PC2: ", round(100 * PCA_var[2]), "% variance")) +
  theme_bw() +
  theme(legend.title = element_blank()) +
  ggtitle("PCA") +
  labs(caption = " ")

Differentially Expressed Genes (DEG) analysis

DEG data

# Generate info table
info <- data.frame(matrix(nrow = ncol(count.mtx), ncol = 2))
colnames(info) <- c('sample', 'cond')
info$sample <- colnames(count.mtx)
info$cond <- "sample information here"

# DESeq
dds <- DESeqDataSetFromMatrix(count.mtx, info, ~ cond)
dds <- DESeq(dds)
res <- results(dds)

Visualization of DEGs by Volcanoplot

Traditional version

res = res %>% mutate(DE=ifelse(log2FoldChange >= log2(fc) & padj < pval, 'UP',
                               ifelse(log2FoldChange <= -log2(fc) & padj < pval, 'DN','no_sig')))
res$DE = factor(res$DE, levels = c('UP','DN','no_sig'))

res %>% 
  ggplot(aes(log2FoldChange, -log10(padj), color=DE)) + 
  geom_point(size=1, alpha=0.5) + 
  scale_color_manual(values = c('red','blue','grey')) +
  theme_classic() +
  geom_vline(xintercept = c(-log2(fc),log2(fc)), color='grey') +
  geom_hline(yintercept = -log10(0.05),color='grey') +
  guides(colour = guide_legend(override.aes = list(size=5))) +
  ggtitle("Resist/Naive") +
  ggeasy::easy_center_title() ## to center title

Volcanoplot (other versions)

Three specific gene groups : black, red, blue
Dots in the plot are colored by the groups.
Axis label represents mathematical expressions.


Prepare data

library(cowplot)
library(dplyr)
library(ggplot2)
library(DESeq2)

dir= '~/Desktop/DF/DFCI_Barbie/DFCI_Barbie_NK_Marco/'

res = read.csv(paste0(dir,"info/published_RNA_seq/DEG_Deseq2_removedSamples_Low_vs_High.csv"), check.names = F)

# Prepare data 
res$color = 'not_selected'

gene_vector <- c("HLA-A", "HLA-B", "B2M", "TAP1", "AXL","CXCL11", "CXCL10", "CXCL9", "CCL5", "CCL2")
gene_vector= gene_vector[gene_vector %in% res$Gene]
res[res$Gene %in% gene_vector,]$color = "selected"

res$label = ''
res[res$Gene %in% gene_vector,]$label = res[res$Gene %in% gene_vector,]$Gene
# Define gene categories with specific biological functions
black <- c("HLA-A", "HLA-B", "B2M", "TAP1", "CXCL9", "CXCL10", "CXCL11", "CCL2", "CCL5") 
black <- black[black %in% res$Gene] 
red <- c("IL2", "IL12A", "IL15", "IL18", "IL21", "IL23A", "IFNA", "IFNB")
red <- red[red %in% res$Gene] 
blue <- c("TGFB1", "IL10", "IL6")
blue <- blue[blue %in% res$Gene] 

# Initialize a label column in the dataset
res$label <- ''
# Update label column to contain gene names for specified categories
res[res$Gene %in% union(black, union(red, blue)),]$label <- res[res$Gene %in% union(black, union(red, blue)),]$Gene

# Assign colors to genes based on their categories
res <- res %>% mutate(label_color = ifelse(Gene %in% black, "black",
                                           ifelse(Gene %in% red, "red",
                                                  ifelse(Gene %in% blue, "blue", "other"))))
# Convert label_color to a factor with specific level order
res$label_color <- factor(res$label_color, levels = c("black", "red", "blue", "other"))

Volcanoplot 1

res %>% 
  ggplot(aes(-log2FoldChange, -log10(padj), color = label_color, alpha = label_color)) + 
  geom_point(data = . %>% filter(!label_color %in% c("black", "red", "blue")),
             size = 1, alpha = 0.3, show.legend = F) +
  geom_point(data = . %>% filter(label_color %in% c("black", "red", "blue")),
             size = 1, show.legend = F) +
  scale_color_manual(values = c("grey","black","red","blue")) +
  scale_alpha_manual(values = c(1, 1, 1, 0.3)) +
  ggrepel::geom_text_repel(data = . %>% filter(label_color %in% c("black", "red", "blue")), 
                           aes(label = label, color = label_color), 
                           size = 4, max.overlaps = Inf, show.legend = FALSE) +
  theme_classic() +
  xlab(expression(Log[2]*" (Fold Change)")) +  
  ylab(expression(-Log[10]*" (padj)")) +
  theme(axis.text = element_text(size = 14)) 

Volcanoplot 2

With guidelines for significance in the plot and x axis limit

# Plotting
res %>% 
  ggplot(aes(-log2FoldChange, -log10(padj), 
             color = label_color, alpha = label_color)) + 
  geom_point(data = . %>% filter(!label_color %in% c("black", "red", "blue")),
             size = 1, alpha = 0.3, show.legend = FALSE) +
  # Plot highlighted genes
  geom_point(data = . %>% filter(label_color %in% c("black", "red", "blue")),
             size = 1, show.legend = FALSE) +
  scale_color_manual(values = c("grey", "black", "red", "blue")) +
  # Set the alpha values for points
  scale_alpha_manual(values = c(1, 1, 1, 0.3)) +
  # Add labels to highlighted genes using ggrepel to avoid overlap
  ggrepel::geom_text_repel(data = . %>% filter(label_color %in% c("black", "red", "blue")), 
                           aes(label = label, color = label_color), 
                           size = 4, max.overlaps = Inf, show.legend = FALSE) +
  theme_classic() + 
  xlab(expression(Log[2]*" (Fold Change)")) +  
  ylab(expression(-Log[10]*" (padj)")) +
  theme(axis.text = element_text(size = 14)) +
  # Define limits and breaks for the x-axis
  xlim(c(-11, 11)) +
  scale_x_continuous(breaks = seq(-10, 10, by = 2), limits = c(-10, 11)) +
  # Add vertical and horizontal dashed lines at significant thresholds
  geom_vline(xintercept = c(-log2(1.5), log2(1.5)), color="grey", linetype = "dashed") +
  geom_hline(yintercept = -log10(0.05), color="grey", linetype = "dashed")

Gene set enrichment analysis

Perform GSEA

library(clusterProfiler)
gobp <- msigdbr::msigdbr(species = "Mus musculus", 
                         category = "C5", subcategory = "GO:BP") %>% 
  dplyr::select(gs_name, gene_symbol)

perform_GSEA <- function(res, ref, pvalueCutoff = 1) {
  ranking <- function(res) {
    df <- res$avg_log2FC
    names(df) <- rownames(res)
    df <- sort(df, decreasing = TRUE)
    return(df)
  }
  
  ranked.res <- ranking(res)
  
  x <- clusterProfiler::GSEA(geneList = ranked.res,
                             TERM2GENE = ref,
                             pvalueCutoff = pvalueCutoff,
                             pAdjustMethod = "BH",
                             verbose = TRUE,
                             seed = TRUE)
  
  result <- x@result %>% arrange(desc(NES))
  result <- result[, c('NES', 'pvalue', 'p.adjust', 'core_enrichment', 'ID')]
  return(result)
}

# Application 
gsea.res = perform_GSEA(res = deg.cluster3, ref = gpbp)

NES plot

# GESA Plot 
gsea_nes_plot =function(gsea.res, title){
  gsea.res %>% ggplot(aes(reorder(ID, NES), NES)) +
    geom_col(aes(fill=p.adjust)) +
    coord_flip() +
    labs(x="Pathway", y="Normalized Enrichment Score",
         title= "GSEA") + 
    theme_classic() +
    scale_fill_gradient(low = 'red', high = '#E5E7E9') +
    theme(axis.text.x= element_text(size=5, face = 'bold'),
          axis.text.y= element_text(size=6, face = 'bold'), 
          axis.title =element_text(size=10)) +ggtitle(title)
}

# Application 
gsea_nes_plot(gsea.res[gsea.res$p.adjust <= 0.05,], title = "Cluster3")

Enrichment plot for the individual pathway (gseaplot2)

# Draw gseaplot 

id1 = "GOBP_NUCLEOSOME_POSITIONING" 
enrichplot::gseaplot2(gsea.res, geneSetID = id1, title = id1)


# Getting the core enrichment genes  
# gsea output : core_enrichment
core_enrichment_genes = gsea.res@result %>% 
  filter(Description == pathway) %>% pull() %>% strsplit("/") %>% unlist()

ssGSEA

Single sample GSEA

Single-sample Gene Set Enrichment Analysis (ssGSEA) is an variation of the GSEA algorithm that instead of calculating enrichment scores for groups of samples (i.e Control vs Disease) and sets of genes (i.e pathways), it provides a score for each each sample and gene set pair.
(https://www.genepattern.org/modules/docs/ssGSEAProjection/4#gsc.tab=0)


tpm = read.csv("../sampledata/tpm.sample_mouse.csv", row.names = 1)
## Perform ssgsea 
library(corto)

## Input data : tpm (count.mtx is accepted as well) 
test = ssgsea(tpm,hallmarkList)

## Reshape it to plot 
test.df = test %>% t() %>% data.frame()
rownames(test.df) = colnames(tpm)

## p value of ssgsea 
pval = corto::z2p(test)
colnames(pval) = rownames(test.df)

ssGSEA heatmap

Color represents enrichment score


## Heatmap 
my.color=c(colorRampPalette(colors = c("#2874A6","white"))(60),
           colorRampPalette(colors = c("white","#D35400"))(70))
t(test.df) %>% pheatmap::pheatmap(color = my.color, 
                                  cluster_rows = F,
                                  cluster_cols = F,
                                  main = "ssGSEA of HALLMARK pathways",
                                  gaps_col = c(3,6)) # Simple heatmap 

Significance by p value Heatmap

Red: p value <= 0.05 (significant)
White : p value > 0.05 (not significant)

## Heatmap 
input.data = (pval<=0.05)
input.data[] <- +input.data
input.data %>% pheatmap::pheatmap(color = c("grey99","salmon"), 
                                  cluster_rows = F,
                                  cluster_cols = F, 
                                  main = "Significance by p value",
                                  gaps_col = c(3,6)) # Simple heatmap 

Single pathway ssGSEA

Pathway sample 1

# Plot it 
genesetname = "WP Apoptosis"
test.df = test.df %>% 
  mutate(significance = ifelse(TCSA_pval <= 0.05, max(test.df[,1]),0))
input = data.frame(test.df[,c(1,3)])
rownames(input) = rownames(test.df)
colnames(input) = c(genesetname, "p value < = 0.05")
my.color=c(colorRampPalette(colors = c("#2874A6","white"))(50),
           colorRampPalette(colors = c("white","#D35400"))(40))
input %>% pheatmap::pheatmap(cluster_rows = F, cluster_cols = F, 
                             color=my.color, border_color = "grey33", 
                             gaps_col = 1, 
                             main = "ssGSEA with significance")

Pathway sample 2

# Plot it 
genesetname = "GPBP DNA Damage"
test.df = test.df %>% 
  mutate(significance = ifelse(TCSA_pval <= 0.05, max(test.df[,1]),0))
input = data.frame(test.df[,c(1,3)])
rownames(input) = rownames(test.df)
colnames(input) = c(genesetname, "p value < = 0.05")
my.color=c(colorRampPalette(colors = c("#2874A6","white"))(50),
           colorRampPalette(colors = c("white","#D35400"))(60))
input %>% pheatmap::pheatmap(cluster_rows = F, cluster_cols = F, 
                             color=my.color, border_color = "grey33", 
                             gaps_col = 1, 
                             main = "ssGSEA with significance")

Multiple genes-violin plot across samples

Define Function

# Import sample data 
tpm.sample = read.csv("../sampledata/tpm.sample.csv", row.names = 1)

cellmarker = read.csv('../info/DB/CellMarker2.0.csv')
celltypes = c("B cell",
              "CD8+ T cell",
              "Activated T cell",
              "Cancer cell",
              "CD4+ T cell",
              "Dendritic cell",
              "Macrophage")

# Functions  
mks.violinplot = function(cellname, tpm=tpm.sample){
  rs <- grepl(cellname, cellmarker$cell_name, ignore.case = TRUE)
  
  mks = cellmarker[rs, ]$marker 
  mks = mks[mks %in% rownames(tpm)]
  
  tpm.df =tpm %>% filter(rownames(.) %in% mks)
  tpm.df$gene = rownames(tpm.df)
  
  p= tpm.df %>% reshape::melt() %>% ggplot(aes(.[,2], log10(.[,3]))) +
    
    geom_violin(aes(fill = variable), alpha = 0.1, show.legend = FALSE) +
    geom_boxplot(outlier.size = 0, width = 0.2, alpha = 0.5, show.legend = FALSE) +
    geom_jitter(aes(color = variable), alpha = 0.5, size = 1, show.legend = FALSE) +
    scale_y_continuous(labels = scales::comma) +
    theme_bw() +
    labs(y = paste0(cellname, "  genes (log10 scaled)"),
         title = paste0(cellname, " signature genes ")) +
    theme(
      axis.title.x = element_blank(),
      axis.text.x = element_text(angle = 90, hjust=1, size=14),
      panel.grid.major.x = element_blank(),
      axis.title.y = element_text(size = 14),  # y 축 레이블 폰트 크기 설정
      plot.title = element_text(size = 20)
    )
  return(p)
}

Application

mks.violinplot(cellname = "B cell", tpm=tpm.sample)

mks.violinplot(cellname = "Dendritic cell", tpm=tpm.sample)

Correlation calculation

Method: spearman
pearson is available

Define Function

# Read CPDM PDX RNAseq TPM value 
dir = "~/Desktop/DF/DFCI_Paweletz/2023_RNA_seq_PDX/"
count.mtx = read.csv(paste0(dir,"count_matrix/CPDM_PDX_RNAseq_DNAnexus/CPDM_PDX_RNAseq_count.mtx.24.04.07.csv"), row.names = 1)
tpm = read.csv(paste0(dir, "count_matrix/CPDM_PDX_RNAseq_DNAnexus/CPDM_PDX_RNAseq_tpm.24.04.07.csv"), row.names = 1)

# Function 
# Spearman correlation to input gene 
gene_cor_spearman = function(gene, tpm_data=tpm){
  input_exp <- t(tpm_data[gene, , drop = FALSE])  # 전치하여 샘플이 열이 되게 함
  
  # 다른 유전자 데이터 확인
  other_genes <- t(tpm_data)  # 전치하여 샘플이 열이 되게 함
  
  # 스피어만 상관 계수 및 p-value 계산
  results <- WGCNA::corAndPvalue(input_exp, other_genes, use = "pairwise.complete.obs", method = "spearman")
  
  df = results$cor %>% t() %>% data.frame()
  df2 = results$p %>% t() %>% data.frame()
  df = cbind(df,df2)
  colnames(df)[2] = "pvalue"
  return(df)
}

Application

# Apply function 
tacstd2 = gene_cor_spearman(gene = "TACSTD2")

# Save data 
# tacstd2 %>% write.csv(paste0(dir,"data/CPDM_cor_spearman/CPDM_RNASeq_cor_TACSTD2_spearman_allGenes.csv"))

Correlation heatmap

Define Function

# Prepare data 
dir <- "~/Desktop/figure/"

# Input data
adc = "ADC_gene_set.plus.CPDM.tpm.csv"
# tcsa = "TCSA_gene_set.KRAS.CPDM.tpm.csv"

tpm.adc = read.csv(paste0("~/Desktop/figure/",adc), row.names = 1)
# tpm.tcsa = read.csv(paste0("~/Desktop/figure/",tcsa), row.names = 1)


# Define Functions  
# Function 1 
correlation_df = function(input_exp){
  input_exp = t(input_exp)
  # 스피어만 상관 계수 및 p-value 계산
  results <- WGCNA::cor(input_exp, use = "pairwise.complete.obs", method = "spearman")
  
  df = results %>% data.frame() # spearman correlation 
  return(df)
}

# Application 
input.adc = correlation_df(input_exp = tpm.adc)

#input.tcsa = correlation_df(input_exp = tpm.tcsa)


# Function 2 
correlation_heatmap = function(input.data, low=60, high=80){
  # Heatmap 
  my.color=c(colorRampPalette(colors = c("#2874A6","white"))(low),
             colorRampPalette(colors = c("white","#D35400"))(high))
  input.data %>% pheatmap::pheatmap(show_rownames = F, show_colnames = F, 
                            border_color = NA,
                            color = my.color,
                            main = "Correlation Heatmap")
}

# Function 2 version 2 
correlation_heatmap = function(input.data, low=60, high=80, 
                               showname= F, fontsize=5){
  # Heatmap 
  my.color=c(colorRampPalette(colors = c("#2874A6","white"))(low),
             colorRampPalette(colors = c("white","#D35400"))(high))
  input.data %>% pheatmap::pheatmap(show_rownames = showname, 
                                    show_colnames = showname, 
                                    fontsize = fontsize,
                                    border_color = NA,
                                    color = my.color,
                                    main = "Correlation Heatmap")
}


# Function 3 
# Create heatmap data frame with corresponding row/column order in heatmap 
correlation_heatmap_dataframe = function(input.data, low=60, high=80){
  # Heatmap 
  my.color=c(colorRampPalette(colors = c("#2874A6","white"))(low),
             colorRampPalette(colors = c("white","#D35400"))(high))
  df = input.data %>% pheatmap::pheatmap(show_rownames = F, show_colnames = F, 
                                    border_color = NA,
                                    color = my.color,
                                    main = "Correlation Heatmap")
  input.reordered = input.data[df$tree_row$order, df$tree_col$order]
  return(input.reordered)
}

Application

input.data = input.adc
# correlation_heatmap(input.data = input.data)
adc.hm.df = correlation_heatmap_dataframe(input.data = input.data)

round(adc.hm.df[1:10,1:10],3) %>% DT::datatable(caption = "Heatmap table")
correlation_heatmap(input.data = input.data)

# To save heatmap as pdf format 
dir <- "~/Desktop/figure/"
pdf(paste0(dir,"ADC_heatmap_with_names.pdf"), width = 10, height = 10)
correlation_heatmap(input.data = input.data, showname = T, fontsize = 5)
dev.off()


# To save heatmap data frame 
input.data = input.adc
adc.hm.df = correlation_heatmap_dataframe(input.data = input.data)
adc.hm.df %>% write.csv(paste0(dir,"cor_spearman_adc_heatmap_order.csv"))